Conclusions at the bottom of the page

# List of required packages
libraries <- c(
  "haven", "dplyr", "tidyr", "ggplot2", "reshape2",
  "viridis", "knitr", "kableExtra"
)

# Install missing packages
for (pkg in libraries) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load all packages
invisible(lapply(libraries, library, character.only = TRUE))
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Loading required package: viridisLite
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
dat <- read_sas("alzheimer25.sas7bdat")
str(dat)
## tibble [1,253 × 38] (S3: tbl_df/tbl/data.frame)
##  $ patid  : num [1:1253] 10001 10002 10003 10004 10005 ...
##  $ trial  : num [1:1253] 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex    : num [1:1253] 0 0 0 1 1 1 0 0 0 1 ...
##  $ age    : num [1:1253] 72 74 74 75 71 72 75 76 68 73 ...
##  $ edu    : num [1:1253] 3 3 1 3 2 3 1 4 3 4 ...
##  $ bmi    : num [1:1253] 26.4 27.1 27.3 24.8 28.2 ...
##  $ inkomen: num [1:1253] 1900 1900 1000 2500 2000 2300 1300 2300 1900 3000 ...
##  $ job    : num [1:1253] 0 0 0 0 0 0 0 0 1 0 ...
##  $ adl    : num [1:1253] 6 5 6 5 6 6 6 6 14 6 ...
##  $ wzc    : num [1:1253] 0 0 0 1 1 1 0 0 0 1 ...
##  $ cdrsb0 : num [1:1253] 4 1 2 1 19 1 19 19 7 4 ...
##  $ cdrsb1 : num [1:1253] 1 3 1 3 11 2 6 12 6 1 ...
##  $ cdrsb2 : num [1:1253] 1 3 12 6 12 4 4 15 8 6 ...
##  $ cdrsb3 : num [1:1253] 6 8 9 7 10 5 11 8 10 7 ...
##  $ cdrsb4 : num [1:1253] 2 NA 8 NA NA 6 6 4 3 10 ...
##  $ cdrsb5 : num [1:1253] 16 NA 13 NA NA NA 2 2 19 NA ...
##  $ cdrsb6 : num [1:1253] NA NA 19 NA NA NA NA NA 1 NA ...
##  $ bprs0  : num [1:1253] 72 77 79 81 72 76 78 80 62 80 ...
##  $ bprs1  : num [1:1253] 79 83 85 90 82 85 84 88 72 87 ...
##  $ bprs2  : num [1:1253] 86 91 92 95 84 91 92 93 80 93 ...
##  $ bprs3  : num [1:1253] 95 98 103 108 92 97 101 103 89 98 ...
##  $ bprs4  : num [1:1253] 97 NA 108 NA NA 108 102 109 99 104 ...
##  $ bprs5  : num [1:1253] 108 NA 118 NA NA NA 111 113 102 NA ...
##  $ bprs6  : num [1:1253] NA NA 123 NA NA NA NA NA 112 NA ...
##  $ abpet0 : num [1:1253] 2 2 2 2.06 2 ...
##  $ abpet1 : num [1:1253] 2 2.02 2.03 2.94 2 ...
##  $ abpet2 : num [1:1253] 2 2.23 2.61 3 2 ...
##  $ abpet3 : num [1:1253] 2 2.97 2.98 3 2 ...
##  $ abpet4 : num [1:1253] 2 NA 3 NA NA ...
##  $ abpet5 : num [1:1253] 2.04 NA 3 NA NA ...
##  $ abpet6 : num [1:1253] NA NA 3 NA NA NA NA NA 2 NA ...
##  $ taupet0: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
##  $ taupet1: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
##  $ taupet2: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
##  $ taupet3: num [1:1253] 1.9 1.9 1.9 1.9 1.9 ...
##  $ taupet4: num [1:1253] 1.9 NA 1.9 NA NA ...
##  $ taupet5: num [1:1253] 1.9 NA 2.03 NA NA ...
##  $ taupet6: num [1:1253] NA NA 2.79 NA NA ...
##  - attr(*, "label")= chr "alzheimer25 dataset written by Stat/Transfer Ver. 15.1.1403.1030"
colSums(is.na(dat))
##   patid   trial     sex     age     edu     bmi inkomen     job     adl     wzc 
##       0       0       0       0       0       0       0       0       0       0 
##  cdrsb0  cdrsb1  cdrsb2  cdrsb3  cdrsb4  cdrsb5  cdrsb6   bprs0   bprs1   bprs2 
##       0     147     239     346     476     601     742       0     147     239 
##   bprs3   bprs4   bprs5   bprs6  abpet0  abpet1  abpet2  abpet3  abpet4  abpet5 
##     346     476     601     742       0     147     239     346     476     601 
##  abpet6 taupet0 taupet1 taupet2 taupet3 taupet4 taupet5 taupet6 
##     742       0     147     239     346     476     601     742

Data Preparation

reshape_dat <- function(dat) {
    dat_long <- dat %>%
        pivot_longer(
            cols = matches("^(bprs|cdrsb|abpet|taupet)\\d+"),
            names_to = c(".value", "time"),
            names_pattern = "(.+)(\\d+)"
        )
    dat_long
}

to_factor <- function(dat_long) {

    dat_long <- dat_long %>%
        mutate(
            time = as.factor(time),  # coded 0-6
            sex = as.factor(sex),
            edu = as.factor(edu),
            trial = as.factor(trial),
            job = as.factor(job),
            wzc = as.factor(wzc),
            time_num = as.numeric(time)  # coded 1-7
        )
    dat_long
}

dat_long <- reshape_dat(dat)
dat_long <- to_factor(dat_long)

EDA

Mean and SD of clinical variables as a function of time

plot_df <- dat_long %>%
  pivot_longer(
    cols = c(bprs, cdrsb, abpet, taupet),
    names_to = "variable",
    values_to = "value"
  ) %>%
  rename(year = time_num) 


mean_var_summary <- plot_df %>%
  group_by(variable, year) %>%
  summarise(
    mean_value = mean(value, na.rm = TRUE),
    var_value  = var(value, na.rm = TRUE),
    sd_value   = sd(value, na.rm = TRUE),
    n          = sum(!is.na(value)),
    se_value   = sd_value / sqrt(n),
    .groups = "drop"
  )

ggplot(mean_var_summary, aes(x = year, y = mean_value)) +
  geom_line(color = "firebrick", linewidth = 1.2) +
  geom_point(color = "firebrick", size = 2) +
  geom_ribbon(aes(ymin = mean_value - sd_value,
                  ymax = mean_value + sd_value),
              fill = "firebrick", alpha = 0.2) +
  facet_wrap(~ variable, scales = "free_y", ncol = 2) +
  labs(
    title = "Mean ± SD per variable across years",
    x = "Year", y = "Mean ± SD"
  ) +
  theme_minimal(base_size = 14) +
  theme(strip.text = element_text(face = "bold"))

Correlation matrix of continuous features at baseline

M <- cor(dat[, c("cdrsb0", "bprs0", "abpet0", "taupet0",
                 "inkomen", "bmi", "job", "trial", "adl", "age", "wzc")],
         use = "pairwise.complete.obs")


melted_M <- reshape2::melt(M)


ord <- hclust(dist(M))$order
melted_M$Var1 <- factor(melted_M$Var1, levels = rownames(M)[ord])
melted_M$Var2 <- factor(melted_M$Var2, levels = colnames(M)[ord])


ggplot(melted_M, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white", linewidth = 0.4) +  # thin white grid lines
  scale_fill_gradient2(
    low = "#4575b4", mid = "white", high = "#d73027", midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation (-1 : 1)"
  ) +
  coord_fixed() + 
  labs(
    title = "Correlation Heatmap of Continuous Features at Baseline ",
    x = NULL, y = NULL
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 11, face = "bold"),
    axis.text.y = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 13, hjust = 0.5, color = "gray30"),
    panel.grid = element_blank(),
    legend.position = "right",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

Correlation matrix of bprs values

desired_order <- c("bprs0","bprs1","bprs2","bprs3","bprs4","bprs5","bprs6")

M <- cor(dat[, desired_order],
         use = "pairwise.complete.obs")


melted_M <- reshape2::melt(M)

melted_M$Var1 <- factor(melted_M$Var1, levels = desired_order)
melted_M$Var2 <- factor(melted_M$Var2, levels = desired_order)


ggplot(melted_M, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white", linewidth = 0.4) +  # thin white grid lines
  scale_fill_gradient2(
    low = "#4575b4", mid = "white", high = "#d73027", midpoint = 0.9,
    limits = c(0.8, 1),
    name = "Correlation (0.8 : 1)"
  ) +
  coord_fixed() + 
  labs(
    title = "Correlation Heatmap of BPRS scores",
    x = NULL, y = NULL
  )

Distribution of Age at baseline by sex

ggplot(dat, aes(x = age, fill = as.factor(sex))) +
  geom_density(alpha = 0.5, color = "black", linewidth = 0.4) +
  scale_fill_manual(
    values = c("#4575b4", "#d73027"), 
    labels = c('Male', 'Female'),
    name = 'Sex'
  ) +
  labs(
    title = "Age Distribution at baseline by Sex",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    legend.position = "top",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 11),
    axis.text = element_text(size = 11),
    panel.grid.minor = element_blank()
  )

EDA by sex

dat <- dat %>%
  mutate(
    sex = factor(sex, labels = c("Male", "Female")),
    wzc = factor(wzc, labels = c("Other", "Nursing Home Resident"))
  )

dat_long <- dat_long %>%
  mutate(
    sex = factor(sex, labels = c("Male", "Female")),
    wzc = factor(wzc, labels = c("Other", "Nursing Home Resident"))
  )

# Individual points
ggplot(dat_long, aes(x = time, y = bprs, color = sex)) + 
  geom_jitter(alpha = 0.6) + 
  scale_color_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Score by Time and Sex',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Individual trajectories
ggplot(dat_long, aes(x = time, y = bprs, color = sex, group = patid)) + 
  geom_line(alpha = 0.4) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Score Profiles by Time and Sex',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Average profile ± SE by sex
ggplot(dat_long, aes(x = time, y = bprs, color = sex, group = sex)) + 
  stat_summary(fun = mean, geom = "line", size = 1) + 
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  scale_color_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal() + 
  labs(
    title = 'Average BPRS Score by Time and Sex',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).

# Boxplot of age at baseline
ggplot(dat, aes(x = sex, y = age, fill = sex)) +
  geom_boxplot(alpha = 0.6, color = "black") +
  scale_fill_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal(base_size = 14) +
  labs(title = "Age at baseline by Sex", x = "Sex", y = "Age") +
  theme(legend.position = "none")

alive_at_year6 <- dat %>%
  filter(!is.na(bprs6)) %>%
  mutate(age_year6 = age + 6)

# Boxplot of age at year 6
ggplot(alive_at_year6, aes(x = sex, y = age_year6, fill = sex)) +
  geom_boxplot(alpha = 0.6, color = "black") +
  scale_fill_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Age at Year 6 by Sex (Only Participants with BPRS6 Available)",
    x = "Sex",
    y = "Age at Year 6"
  ) +
  theme(legend.position = "none")

# Dropouts by sex

dropouts_sex <- dat %>% 
  group_by(sex) %>%
  summarise(across(starts_with("bprs"), ~ sum(!is.na(.x)))) %>%
  ungroup()

dropouts_sex %>%
  kable(caption = "Number of Patients by Sex at Each Measurement Year") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Number of Patients by Sex at Each Measurement Year
sex bprs0 bprs1 bprs2 bprs3 bprs4 bprs5 bprs6
Male 616 583 551 506 463 409 341
Female 637 523 463 401 314 243 170

EDA by age

# Individuals points 
ggplot(dat_long, aes(x = time, y = bprs, color = age)) + 
  geom_jitter(alpha = 0.6) +
  scale_color_viridis_c(name = "Age") +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Scores by Time and Age',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

set.seed(123) 

sample_ids <- dat_long %>%
  distinct(patid) %>%
  sample_n(150) %>%
  pull(patid)

dat_long_sample <- dat_long %>%
  filter(patid %in% sample_ids)

# Individual trajectories
ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = age)) +
  geom_line(alpha = 0.5, linewidth = 0.8) +
  scale_color_viridis_c(name = "Age") +
  theme_minimal() +
  labs(
    title = "Individual BPRS Profiles by Time and Age (Sample of 150 Patients)",
    x = "Year",
    y = "BPRS Score"
  )
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Dropouts by age

dropouts_age <- dat %>%
  group_by(age) %>%
  summarise(across(starts_with("bprs"), ~ sum(!is.na(.x)))) %>%
  ungroup()

dropouts_age %>%
  kable(caption = "Number of Patients by Age at Each Measurement Year") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Number of Patients by Age at Each Measurement Year
age bprs0 bprs1 bprs2 bprs3 bprs4 bprs5 bprs6
46 1 1 1 1 1 1 1
51 1 1 1 1 1 1 1
52 2 2 2 2 2 2 2
53 2 2 2 2 2 2 2
54 4 4 4 4 4 4 4
55 4 4 4 4 4 4 4
56 3 3 3 3 3 3 3
57 10 10 10 10 10 10 10
58 10 10 10 10 10 10 10
59 15 15 15 15 15 15 15
60 10 10 10 10 10 10 10
61 17 17 17 17 17 17 17
62 31 31 31 31 31 31 31
63 27 27 27 27 27 27 26
64 37 37 37 37 37 37 35
65 35 35 35 34 34 31 27
66 49 49 49 49 46 45 39
67 60 60 60 59 58 53 42
68 58 57 57 54 50 44 37
69 56 56 56 53 49 41 33
70 61 61 59 55 53 49 40
71 68 68 68 61 50 38 26
72 84 82 77 71 58 42 27
73 64 60 55 44 38 30 20
74 51 50 48 40 28 24 13
75 66 63 51 44 36 23 9
76 61 52 50 45 33 23 10
77 65 57 50 43 29 18 8
78 34 29 25 16 10 7 3
79 45 37 32 26 15 4 2
80 44 30 19 15 10 3 1
81 34 27 16 7 0 0 0
82 33 16 11 9 2 0 0
83 23 15 10 5 2 1 1
84 24 10 4 1 0 0 0
85 19 9 4 0 0 0 0
86 12 5 4 2 2 2 2
87 14 3 0 0 0 0 0
88 6 0 0 0 0 0 0
89 3 0 0 0 0 0 0
90 3 1 0 0 0 0 0
91 2 0 0 0 0 0 0
92 2 0 0 0 0 0 0
93 2 0 0 0 0 0 0
94 1 0 0 0 0 0 0

EDA by wzc

# Individual points
ggplot(dat_long, aes(x = time, y = bprs, color = wzc)) + 
  geom_jitter(alpha = 0.6) + 
  scale_color_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Scores by Time and WZC',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

set.seed(123) 

sample_ids <- dat_long %>%
  distinct(patid) %>%
  sample_n(150) %>%
  pull(patid)

dat_long_sample <- dat_long %>%
  filter(patid %in% sample_ids)

# Individual trajectories
ggplot(dat_long, aes(x = time, y = bprs, color = wzc, group = patid)) + 
  geom_line(alpha = 0.4) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Profiles by Time by Sex',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Average profile ± SE by WZC
ggplot(dat_long, aes(x = time, y = bprs, color = wzc, group = wzc)) + 
  stat_summary(fun = mean, geom = "line", size = 1) + 
  stat_summary(fun.data = mean_se, geom = "errorbar", width = 0.3) +
  scale_color_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal() + 
  labs(
    title = 'Average BPRS Score by Time and WZC',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).
## Warning: Removed 2551 rows containing non-finite outside the scale range
## (`stat_summary()`).

ggplot(dat, aes(x = wzc, y = age, fill = wzc)) +
  geom_boxplot(alpha = 0.6, color = "black") +
  scale_fill_manual(values = c("#4575b4", "#d73027")) +
  theme_minimal(base_size = 14) +
  labs(title = "Age at baseline by WZC", x = "WZC", y = "Age") +
  theme(legend.position = "none")

dropouts_wzc <- dat %>%
  group_by(wzc) %>%
  summarise(across(starts_with("bprs"), ~ sum(!is.na(.x)))) %>%
  ungroup()

dropouts_wzc %>%
  kable(caption = "Number of Patients by WZC at Each Measurement Year") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Number of Patients by WZC at Each Measurement Year
wzc bprs0 bprs1 bprs2 bprs3 bprs4 bprs5 bprs6
Other 766 739 721 686 626 559 465
Nursing Home Resident 487 367 293 221 151 93 46

EDA by trial

# Individuals points 
ggplot(dat_long, aes(x = time, y = bprs, color = as.factor(trial))) + 
  geom_jitter(alpha = 0.6) +
  scale_fill_manual(
    values = grDevices::rainbow(length(unique(dat$trial))),
    name = "Trial"
  ) +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Scores as by Time and Trial',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Individual trajectories
ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = as.factor(trial))) +
  geom_line(alpha = 0.5, linewidth = 0.8) +
  scale_fill_manual(
    values = grDevices::rainbow(length(unique(dat$trial))),
    name = "Trial"
  ) +
  theme_minimal() +
  labs(
    title = "Individual BPRS Profiles by Time and Trial (Sample of 150 Patients)",
    x = "Year",
    y = "BPRS Score"
  )
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).

ggplot(dat, aes(x = as.factor(trial), y = age, fill = as.factor(trial))) +
  geom_boxplot(alpha = 0.7, color = "black") +
  scale_fill_manual(
    values = grDevices::rainbow(length(unique(dat$trial))),
    name = "Trial"
  ) +
  theme_minimal(base_size = 14) +
  labs(title = "Age at baseline by Trial", x = "Trial", y = "Age") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

EDA by job

ggplot(dat_long, aes(x = time, y = bprs, color = as.factor(job))) + 
  geom_jitter(alpha = 0.6) +
  scale_color_manual(
    values = grDevices::rainbow(length(unique(dat$job))),
    name = "Job"
  ) +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Scores by Time and Job',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = as.factor(job))) +
  geom_line(alpha = 0.5, linewidth = 0.8) +
  scale_color_manual(
    values = grDevices::rainbow(length(unique(dat$job))),
    name = "Job"
  ) +
  theme_minimal() +
  labs(
    title = "Individual BPRS Profiles by Time and Job (Sample of 150 Patients)",
    x = "Year",
    y = "BPRS Score"
  )
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).

ggplot(dat, aes(x = as.factor(job), y = age, fill = as.factor(job))) +
  geom_boxplot(alpha = 0.7, color = "black") +
  scale_fill_manual(
    values = grDevices::rainbow(length(unique(dat$job))),
    name = "Job"
  ) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Age at Baseline by Job",
    x = "Job",
    y = "Age"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

EDA by Adl

ggplot(dat_long, aes(x = time, y = bprs, color = as.factor(adl))) + 
  geom_jitter(alpha = 0.6) +
  scale_color_manual(
    values = grDevices::rainbow(length(unique(dat$adl))),
    name = "ADL"
  ) +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Scores by Time and ADL',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = as.factor(adl))) +
  geom_line(alpha = 0.5, linewidth = 0.8) +
  scale_color_manual(
    values = grDevices::rainbow(length(unique(dat$adl))),
    name = "ADL"
  ) +
  theme_minimal() +
  labs(
    title = "Individual BPRS Profiles by Time and ADL (Sample of 150 Patients)",
    x = "Year",
    y = "BPRS Score"
  )
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).

ggplot(dat, aes(x = as.factor(adl), y = age, fill = as.factor(adl))) +
  geom_boxplot(alpha = 0.7, color = "black", outlier.size = 0.8) +
  scale_fill_manual(
    values = grDevices::rainbow(length(unique(dat$adl))),
    name = "ADL"
  ) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Age at Baseline by ADL",
    x = "ADL Category",
    y = "Age"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

## EDA by BMI

ggplot(dat_long, aes(x = time, y = bprs, color = bmi)) + 
  geom_jitter(alpha = 0.6) +
  scale_color_viridis_c(name = "BMI") +
  theme_minimal() + 
  labs(
    title = 'Individual BPRS Scores by Time and BMI',
    x = 'Year',
    y = 'BPRS Score'
  )
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(dat_long_sample, aes(x = time, y = bprs, group = patid, color = bmi)) +
  geom_line(alpha = 0.5, linewidth = 0.8) +
  scale_color_viridis_c(name = "BMI") +
  theme_minimal() +
  labs(
    title = "Individual BPRS Profiles by Time and BMI (Sample of 150 Patients)",
    x = "Year",
    y = "BPRS Score"
  )
## Warning: Removed 355 rows containing missing values or values outside the scale range
## (`geom_line()`).

dat <- dat %>%
  mutate(age_bin = cut(
    age,
    breaks = c(-Inf, 60,70, 80, Inf),
    labels = c("<60", "60-70", "70-80", "80+"),
    right = FALSE
  ))


ggplot(dat, aes(x = age_bin, y = bmi, fill = age_bin)) +
  geom_boxplot(alpha = 0.7, color = "black") +
  scale_fill_manual(values = c("#b3cde0", "#005b96", "#03396c","#011f4b")) +
  theme_minimal(base_size = 14) +
  labs(
    title = "BMI at Baseline by Age Group",
    x = "Age Group",
    y = "BMI"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Overall dropout rate

non_na_counts <- dat %>%
  summarise(across(starts_with("bprs"), ~ sum(!is.na(.x))))

plot_data <- non_na_counts %>%
  pivot_longer(
    cols = everything(),
    names_to = "measurement_occasion",
    values_to = "count"
  ) %>%
  mutate(year = as.numeric(gsub("bprs", "", measurement_occasion)))

ggplot(plot_data, aes(x = year, y = count)) +
  geom_col(fill = "steelblue", color = "black") + 
  geom_text(aes(label = count), vjust = -0.5) + 
  theme_minimal() +
  labs(
    title = "Number of Available Observations at Each Time Point",
    x = "Year",
    y = "Number of available observations"
  )

## Correlation between dropout rate and age and bprs at baseline

dat$NAs_1 <- as.numeric(is.na(dat$bprs1))
dat$NAs_2 <- as.numeric(is.na(dat$bprs2))
dat$NAs_3 <- as.numeric(is.na(dat$bprs3))
dat$NAs_4 <- as.numeric(is.na(dat$bprs4))
dat$NAs_5 <- as.numeric(is.na(dat$bprs5))
dat$NAs_6 <- as.numeric(is.na(dat$bprs6))

corr_data <- dat %>%
  select(age, bprs0, NAs_1:NAs_6)

cor_matrix <- cor(corr_data, use="pairwise.complete")

melted <- melt(cor_matrix)

ggplot(melted, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile(color = "white", linewidth = 0.4) +  # thin white grid lines
  scale_fill_gradient2(
    low = "#4575b4", mid = "white", high = "#d73027", midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation (-1 : 1)"
  ) +
  coord_fixed() + 
  labs(
    title = "Correlation Heatmap ",
    subtitle = "Number of Dropouts with Age and BPRS at Baseline",
    x = NULL, y = NULL
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 11, face = "bold"),
    axis.text.y = element_text(size = 11, face = "bold"),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 13, hjust = 0.5, color = "gray30"),
    panel.grid = element_blank(),
    legend.position = "right",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10)
  )

# Paired t-tests baseline (0) vs final (6)
t.test(dat$cdrsb0, dat$cdrsb6, paired = TRUE)
## 
##  Paired t-test
## 
## data:  dat$cdrsb0 and dat$cdrsb6
## t = -8.73, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -6.681396 -4.226628
## sample estimates:
## mean difference 
##       -5.454012
t.test(dat$bprs0,  dat$bprs6,  paired = TRUE)
## 
##  Paired t-test
## 
## data:  dat$bprs0 and dat$bprs6
## t = -173.5, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -42.78828 -41.83012
## sample estimates:
## mean difference 
##        -42.3092
t.test(dat$abpet0, dat$abpet6, paired = TRUE)
## 
##  Paired t-test
## 
## data:  dat$abpet0 and dat$abpet6
## t = -11.407, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.2096247 -0.1480270
## sample estimates:
## mean difference 
##      -0.1788258
t.test(dat$taupet0,dat$taupet6,paired = TRUE)
## 
##  Paired t-test
## 
## data:  dat$taupet0 and dat$taupet6
## t = -9.1044, df = 510, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.13856669 -0.08937852
## sample estimates:
## mean difference 
##      -0.1139726

Preliminary OLS model

# We fit a standard linear model (OLS) ignoring the patient-specific correlations.
preliminary_ols_model <- lm(bprs ~ time_num + sex + age + wzc + adl, 
                             data = dat_long, na.action = na.exclude)


# The residuals represent the part of the BPRS score that is NOT explained by the average trend.
# In theory, this is what's left over for the random effects and residual error to explain.

dat_long$ols_residuals = residuals(preliminary_ols_model)


# If the random effects were just a random intercept (b_0i), these profiles should be flat but
# shifted up or down for each patient, with random noise.
# If there is a random slope (b_1i), we would expect to see individual slopes in these residual profiles.

ggplot(dat_long, aes(x = time_num, y = ols_residuals, group = patid)) +
  geom_line(alpha = 0.3) + 
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "OLS Residual Profiles for Each Patient",
       x = "Year (Numeric)",
       y = "OLS Residuals") +
  theme_minimal()
## Warning: Removed 2551 rows containing missing values or values outside the scale range
## (`geom_line()`).

Conclusion

Summarize your conclusions. What are the implications with respect to statistical modeling ?

From our exploratory data analysis, we observe that each patient’s clinical and neuroimaging features were measured up to six times, corresponding to annual follow-ups over six years. The variable of interest, BPRS, which measures the severity of psychiatric symptoms, shows a linear trend over time. The variance appears to remain more or less constant across the years.

Among the four measures, BPRS shows by far the greatest change across time. This indicates that behavioral and psychiatric deterioration dominates the longitudinal trajectory, whereas cognitive decline (CDRSB) is more moderate, and biological markers (ABPET, TAUPET) show much smaller absolute shifts. All variables show statistically significant changes, but their effect sizes differ dramatically, emphasizing that statistical significance does not necessarily imply clinical importance*

*(Matteo: I am not sure anymore about this sentence, because we have no idea how clinically relevant is a big change in one feature. As in, maybe a smaller absolute change in ABPET has much stronger clinical effects than a bigger absolute change in BPRS).

The correlation matrix indicates that BPRS at baseline is positively correlated with age, WZC, and other clinical features, while it is negatively correlated with job, ADL, trial, and income. However, further exploration suggests that age is likely the main driver of variation in BPRS.

A spurious association is observed between sex and BPRS: women appear to have higher BPRS scores than men, but this can be explained by their higher mean age at baseline. Similar spurious relationships are found for WZC, job, ADL, and trial, all of which are confounded by age.

The longitudinal plots confirm a positive, linear association between age and BPRS. There is also considerable attrition over time, with the number of observations decreasing from 1,253 at baseline to 511 at year six. Approximately 63% of the dropouts are women, likely due to their higher baseline age, which may have led to faster progression of psychiatric symptoms or inability to continue participation. It is plausible that participants with high baseline BPRS scores either died or became too impaired to remain in the study.

This pattern of dropout might explain why the overall variance of BPRS appears constant over time. Although sample size decreases, the reduction in the range of BPRS values among remaining participants likely counterbalances the expected increase in variance.

Based on these findings, a linear mixed-effects model is the most appropriate statistical approach for analyzing this longitudinal dataset. Time and age should be included as fixed effects, with patient ID modeled as a random effect to account for repeated measures within individuals. Since patients are also nested within trials, trial can be included as an additional random effect. The residuals extracted from the OLS model follow this formula: \(r_i = y_i - X_i\hat{\beta}_{OLS} \approx Z_ib_i + \epsilon_i\), which implies that after taking into account the fixed effects, any variance left is the sum of subject-specific effects plus any residual variance. From the spaghetti plot of the residuals we can confirm that different subjects need different intercepts. This plot also reveals the need of subject-specific slopes, as some subjects’ bprs score increase faster or slower than the average over time. The fact that the residuals cluster around 0 shows that the distributional assumption \(E \left( \begin{pmatrix} b_{0i} \\ b_{1i} \end{pmatrix} \right) = \begin{pmatrix} 0 \\ 0 \end{pmatrix}\) is met.

Finally, given that the data are collected at regular time intervals, it is reasonable to assume an autoregressive correlation structure for the within-subject residuals, capturing the expected correlation between adjacent time points (plot a semivariogram once the linear mixed model is fitted).